# - # Name: module # Purpose: # # Author: Castel # # Created: / / # Copyright: (c) Castel # Licence: # - import os import regex import re import random import unidecode import nltk from nltk import tokenize # nltk download('punkt') import requests import regex regex as regex SITE = 'https://neculaifantanaru com/' LISTA CUVINTE LEGATURA = [ 'in', 'la', 'unei', 'si', 'sa', 'se', 'de', 'prin', 'unde', 'care', 'a', 'al', 'prea', 'lui', 'din', 'ai', 'unui', 'acei', 'un', 'doar', 'tine', 'ale', 'sau', 'dintre', 'intre', 'cu', 'ce', 'va', 'fi', 'este', 'cand', 'o', 'cine', 'aceasta', 'ca', 'dar', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'to', 'was', 'your', 'you', 'is', 'are', 'iar', 'fara', 'asta', 'pe', 'tu', 'nu', 'mai', 'ne', 'le', 'intr', 'cum', 'e', 'for', 'she', 'it', 'esti', 'this', 'that', 'how', 'can', 't', 'must', 'be', 'the', 'and', 'do', 'so', 'or', 'ori', 'who', 'what', 'if', 'of', 'on', 'i', 'we', 'they', 'them', 'but', 'where', 'by', 'an', 'mi', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'made', 'my', 'me', 'vom', 'voi', 'ei', 'cat', 'ar', 'putea', 'poti', 'sunteti', 'inca', 'still', 'noi', 'l', 'ma', 's', 'dupa', 'after', 'under', 'sub', 'niste', 'some', 'those', 'he', 'no', 'too', 'fac', 'made', 'make', 'cei', 'most', 'face', 'pentru', 'cat', 'cate', 'much', 'more', 'many', 'sale', 'tale', 'tau', 'has', 'sunt', 'his', 'yours', 'only', 'as', 'toate', 'all', 'tot', 'incat', 'which', 'ti', 'asa', 'like', 'these', 'because', 'unor', 'caci', 'ele', 'have', 'haven', 'te', 'cea', 'else', 'imi', 'iti', 'should', 'could', 'not', 'even', 'chiar', 'when', 'ci', 'ne', 'ni', 'her', 'our', 'alta', 'another', 'other', 'decat', 'acelasi', 'same', 'au', 'had', 'haven', 'hasn', 'alte', 'alt', 'others', 'ceea', 'cel', 'cele', 'alte', 'despre', 'about', 'acele', 'acel', 'acea', 'decit', 'with', ' ', 'fata', 'towards', 'against', 'cind', 'dinspre', 'fost', 'been', 'era', 'daca', 'eu', 'el', 'him', 'ea', 'will', 'am', 'cannot', 'between', 'cause', 'may', 'couldn', 'IN', 'LA', 'UNEI', 'SI', 'SA', 'SE', 'DE', 'PRIN', 'UNDE', 'CARE', 'A', 'AL', 'PREA', 'LUI', 'DIN', 'AI', 'UNUI', 'ACEI', 'UN', 'DOAR', 'TINE', 'ALE', 'SAU', 'DINTRE', 'INTRE', 'CU', 'CE', 'VA', 'FI', 'ESTE', 'CAND', 'O', 'CINE', 'ACEASTA', 'CA', 'DAR', 'TO', 'WAS', 'YOUR', 'YOU', 'IS', 'ARE', 'IAR', 'FARA', 'ASTA', 'PE', 'TU', 'NU', 'MAI', 'NE', 'LE', 'INTR', 'CUM', 'E', 'FOR', 'SHE', 'IT', 'ESTI', 'THIS', 'THAT', 'HOW', 'CAN', 'T', 'MUST', 'BE', 'THE', 'AND', 'DO', 'SO', 'OR', 'ORI', 'WHO', 'WHAT', 'IF', 'OF', 'ON', 'I', 'WE', 'THEY', 'THEM', 'BUT', 'WHERE', 'BY', 'AN', 'MI', 'MADE', 'MY', 'ME', 'VOM', 'VOI', 'EI', 'CAT', 'AR', 'PUTEA', 'POTI', 'SUNTETI', 'INCA', 'STILL', 'NOI', 'L', 'MA', 'S', 'DUPA', 'AFTER', 'UNDER', 'SUB', 'NISTE', 'SOME', 'THOSE', 'HE', 'NO', 'TOO', 'FAC', 'MADE', 'MAKE', 'CEI', 'MOST', 'FACE', 'PENTRU', 'CAT', 'CATE', 'MUCH', 'MORE', 'MANY', 'SALE', 'TALE', 'TAU', 'HAS', 'SUNT', 'HIS', 'YOURS', 'ONLY', 'AS', 'TOATE', 'ALL', 'TOT', 'INCAT', 'WHICH', 'TI', 'ASA', 'LIKE', 'THESE', 'BECAUSE', 'UNOR', 'CACI', 'ELE', 'HAVE', 'HAVEN', 'TE', 'CEA', 'ELSE', 'IMI', 'ITI', 'SHOULD', 'COULD', 'NOT', 'EVEN', 'CHIAR', 'WHEN', 'CI', 'NE', 'NI', 'HER', 'OUR', 'ALTA', 'ANOTHER', 'OTHER', 'DECAT', 'ACELASI', 'SAME', 'AU', 'HAD', 'HAVEN', 'HASN', 'ALTE', 'ALT', 'OTHERS', 'CEEA', 'CEL', 'CELE', 'ALTE', 'DESPRE', 'ABOUT', 'ACELE', 'ACEL', 'ACEA', 'DECIT', 'WITH', ' ', 'FATA', 'TOWARDS', 'AGAINST', 'CIND', 'DINSPRE', 'FOST', 'BEEN', 'ERA', 'DACA', 'EU', 'EL', 'HIM', 'EA', 'WILL', 'AM', 'CANNOT', 'BETWEEN', 'CAUSE', 'MAY', 'COULDN', 'In', 'La', 'Unei', 'Si', 'Sa', 'Se', 'De', 'Prin', 'Unde', 'Care', 'Al', 'Prea', 'Lui', 'Din', 'Ai', 'Unui', 'Acei', 'Un', 'Doar', 'Tine', 'Ale', 'Sau', 'Dintre', 'Intre', 'Cu', 'Ce', 'Va', 'Fi', 'Este', 'Cand', 'Cine', 'Aceasta', 'Ca', 'Dar', 'Ii', 'Iii', 'Iv', 'V', 'Vi', 'Vii', 'Viii', 'To', 'Was', 'Your', 'You', 'Is', 'Are', 'Iar', 'Fara', 'Asta', 'Pe', 'Tu', 'Nu', 'Mai', 'Ne', 'Le', 'Intr', 'Cum', 'For', 'She', 'It', 'Esti', 'This', 'That', 'How', 'Can', 'Must', 'Be', 'The', 'And', 'Do', 'So', 'Or', 'Ori', 'Who', 'What', 'If', 'Of', 'On', 'We', 'They', 'Them', 'But', 'Where', 'By', 'An', 'Mi', 'Made', 'My', 'Me', 'Vom', 'Voi', 'Ei', 'Cat', 'Ar', 'Putea', 'Poti', 'Sunteti', 'Inca', 'Still', 'Noi', 'Ma', 'Dupa', 'After', 'Under', 'Sub', 'Niste', 'Some', 'Those', 'He', 'No', 'Too', 'Fac', 'Made', 'Make', 'Cei', 'Most', 'Face', 'Pentru', 'Cat', 'Cate', 'Much', 'More', 'Many', 'Sale', 'Tale', 'Tau', 'Has', 'Sunt', 'His', 'Yours', 'Only', 'As', 'Toate', 'All', 'Tot', 'Incat', 'Which', 'Ti', 'Asa', 'Like', 'These', 'Because', 'Unor', 'Caci', 'Ele', 'Have', 'Haven', 'Te', 'Cea', 'Else', 'Imi', 'Iti', 'Should', 'Could', 'Not', 'Even', 'Chiar', 'When', 'Ci', 'Ne', 'Ni', 'Her', 'Our', 'Alta', 'Another', 'Other', 'Decat', 'Acelasi', 'Same', 'Au', 'Had', 'Haven', 'Hasn', 'Alte', 'Alt', 'Others', 'Ceea', 'Cel', 'Cele', 'Alte', 'Despre', 'About', 'Acele', 'Acel', 'Acea', 'Decit', 'With', 'Fata', 'Towards', 'Against', 'Cind', 'Dinspre', 'Fost', 'Been', 'Era', 'Daca', 'Eu', 'El', 'Him', 'Ea', 'Will', 'Am', 'Cannot', 'Between', 'Cause', 'May', 'Couldn', 'destul', 'enough', 'Destul', 'Enough', 'from', 'FROM', 'From', 'ia', 'Ia', 'IA' ] #PATTERN LINK = " {} " PATTERN LINK = " {} " ''' structura dictionar cuvinte { "cuvantul ": [lista linkuri ], "cuvantul ": [lista linkuri ] } ''' CALE FISIER LINKURI = "d:\\Folder \\LINKS\\links txt" # folosim DEF cand vrem sa definim o functie => un cuvant cheie in Python # REGULA: def nume functie(lista argumente) def preia cuvinte link(link): cuvinte = link split(' ')[ ] # [ ] ia primul element iar daca pun [ ] ia al doilea element cuvinte = cuvinte split('-') cuvinte ok = list() for cuv in cuvinte: if cuv not in LISTA CUVINTE LEGATURA: cuvinte ok append(cuv) return cuvinte ok # am pus return fiindca voi avea nevoie de rezultatul functiei de mai sus def preia cuvinte lista linkuri(cale fisier linkuri): dictionar cuvinte linkuri = dict() with open(cale fisier linkuri, encoding='utf ') as fp: lines = fp readlines() for line in lines: # functia preia cuvinte link returneaza un rezultat care este salvat in variabila cuvinte link cuvinte link = preia cuvinte link(line strip()) for cuv in cuvinte link: if cuv in dictionar cuvinte linkuri keys(): if SITE + line strip() not in dictionar cuvinte linkuri[cuv]: dictionar cuvinte linkuri[cuv] append(SITE + line strip()) else: dictionar cuvinte linkuri[cuv] = [SITE + line strip()] return dictionar cuvinte linkuri def citeste fisier linie cu linie(cale fisier): with open(cale fisier, encoding='utf ') as fp: lines = fp readlines() count = for line in lines: print(count, line strip()) count += def read text from file(file path): """ Aceasta functie returneaza continutul unui fisier file path: calea catre fisierul din care vrei sa citesti """ # with open(file path, encoding='utf ') as f: with open(file path, encoding='utf ', errors='ignore') as f: text = f read() return text def write to file(text, file path): """ Aceasta functie scrie un text intr-un fisier text: textul pe care vrei sa il scrii file path: calea catre fisierul in care vrei sa scrii """ with open(file path, 'wb') as f: f write(text encode('utf ', 'ignore')) def introducere linkuri(page, paragrafe): tag = " {} " text start final = "" LINK INTRODUS = # incepem de la a doua treime din text # start paragraf = # prima optiune start paragraf = int(len(paragrafe) / ) for paragraf in paragrafe[:start paragraf]: if len(re findall(r'( )([\s\S]*?)( )', paragraf)) != : paragraf = re sub(r'( )([\s\S]*?)( )', r'\ ', paragraf) if len(re findall(r'\n+', paragraf)) != : # print(paragraf) paragraf = re sub(r'\n+', r'', paragraf) text start final = text start final + '\n' + tag format(paragraf) for paragraf in paragrafe[start paragraf:]: if len(re findall(r'( )([\s\S]*?)( )', paragraf)) != : paragraf = re sub(r'( )([\s\S]*?)( )', r'\ ', paragraf) lista cuvinte gasite = list() if LINK INTRODUS == : # gasim toate cuvintele din paragraful curent cuvinte = re findall(r' (?:\w|-*\!)+[ ,]', paragraf) dictionar linkuri = preia cuvinte lista linkuri(CALE FISIER LINKURI) for cuv in cuvinte: cuv fara semne = cuv replace(' ', '') replace(',', '') if cuv fara semne in dictionar linkuri keys(): lista cuvinte gasite append(cuv) # lista de cuvinte gasite in paragraf, dar care se gasesc si in dictionar lista cuvinte gasite = list(set(lista cuvinte gasite)) # daca s-au gasit cuvinte in paragraf, atunci adaugam link-ul in paragraf if len(lista cuvinte gasite) > : cuvant random = random sample(lista cuvinte gasite, )[ ] cuvant random fara semne = cuvant random replace(' ', '') replace(',', '') link random = random sample(dictionar linkuri[cuvant random fara semne], )[ ] # singur cuvant subliniat ''' pattern = PATTERN LINK format(link random, cuvant random strip()) paragraf = paragraf replace(cuvant random strip(), pattern, ) LINK INTRODUS = ''' # doua cuvinte subliniate expresie regulata = cuvant random strip() + r' *\w+' # print("expr: ", expresie regulata) urmatorul cuvant = re findall(expresie regulata, paragraf) if len(urmatorul cuvant) == : print("Nu am gasit urmatorul cuvant pe pagina {}!!!" format(page)) pattern = PATTERN LINK format(link random, cuvant random strip()) paragraf = paragraf replace(cuvant random strip(), pattern, ) LINK INTRODUS = else: urmatorul cuvant = re findall(expresie regulata, paragraf)[ ] pattern = PATTERN LINK format(link random, urmatorul cuvant) paragraf = paragraf replace(urmatorul cuvant, pattern, ) LINK INTRODUS = paragraf = tag format(paragraf) if len(re findall(r'\n+', paragraf)) != : paragraphs[i] = re sub(r'\n+', r'', paragraf) text start final = text start final + '\n' + paragraf if LINK INTRODUS == : print("Nu am introdus niciun link-ul pe pagina: {} " format(page)) return text start final # Preluare site-uri de pe o anumita pagina (vezi variabila PAGE) FOLDER LOCAL = 'd:/Folder /fisiere html/de-convertit/ ' # aici pui fisierele HTML - # DUPA RULAREA CODULUI, SALVAREA NOILOR FISIERE VA AVEA LOC IN d:\Folder \fisiere html modificate\ # INCEPUT CHATGPT page text pattern = re compile(' ([\s\S]*?) ') # AICI PUI PRIMA LEGATURA DINTRE START si FINAL img tag pattern = re compile(' ') # Acesta este regexul care va căuta toate tagurile html tag pattern = re compile(' ]+>') # Acesta este noul pattern pentru eliminarea tagurilor HTML link replace pattern = re compile(' ') # pentru înlocuirea tagului script pattern = re compile('if\(typeof ez ad units!=\'undefined\'\){ez ad units push\(\[\[ , ],\'drawandpaintforfun com-box- \',\'ezslot \', ,\' \',\' \'\]\]\); ez fad position\(\'div-gpt-ad-drawandpaintforfun com-box- - \'\);}') page text pattern = re compile(' ([\s\S]*?) Compartir ') page text pattern = re compile(' ([\s\S]*?) ') page text pattern = re compile(' ([\s\S]*?) ') paragraph pattern = re compile(' ([\s\S]*?) ') counter sterse = INFORMATII PAGINI = list() for f in os listdir(FOLDER LOCAL): if f endswith(' html') or f endswith(' htm'): filepath = os path join(FOLDER LOCAL, f) page html = read text from file(filepath) # Înlocuim tagul cu page html = re sub(link replace pattern, ' " />', page html) # La acest punct, toate tagurile vor fi înlocuite cu un punct page html = re sub(img tag pattern, ' ', page html) # La acest punct, scriptul specificat va fi înlocuit cu un punct page html = re sub(script pattern, ' ', page html) # Obținem textul paginii page text = re findall(page text pattern, page html) page text = re findall(page text pattern , page html) page text = re findall(page text pattern , page html) page text = re findall(page text pattern , page html) if len(page text) != or len(page text ) != or len(page text ) != or len(page text ) != : if len(page text) != : page text = page text[ ] elif len(page text ) != : page text = page text [ ] elif len(page text ) != : page text = page text [ ] else: page text = page text [ ] # La acest punct avem 'page text' care este textul din div-ul selectat # Acum vom înlocui toate tagurile HTML cu un punct page text no html = re sub(html tag pattern, ' ', page text) # FINAL CHAPGPT # inlocuim textul de tipul "text " cu " text " page text = re sub(r'(^ *)(?! )([\s\S]*?)( )', r' \ \ \ ', page text) # extragem paragrafele si construim un text nou paragraphs = re findall(paragraph pattern, page text) new paragraphs = list() if len(paragraphs) == : continue else: for i in range(len(paragraphs)): if ' ' in paragraphs[i]: if len(re findall(r'^( *?)( )', paragraphs[i], flags=re MULTILINE)) != : paragraphs[i] = re sub(r'^( *?)( )', r'\ ', paragraphs[i], flags=re MULTILINE) if len(re findall(r'(^ *)(?! )( +?)( )', paragraphs[i], flags=re MULTILINE)) != : paragraphs[i] = re sub(r'(^ *)(?! )( +?)( )', r' \ \ \ ', paragraphs[i], flags=re MULTILINE) if len(re findall(r'^(?! )( +?)', paragraphs[i], flags=re MULTILINE)) != : paragraphs[i] = re sub(r'^(?! )( *)', r' \ ', paragraphs[i], flags=re MULTILINE) if len(re findall(paragraph pattern, paragraphs[i])) != : for p in re findall(paragraph pattern, paragraphs[i]): new paragraphs append(p) else: new paragraphs append(paragraphs[i]) # introducem link-urile new page text = introducere linkuri(f, new paragraphs) title pattern = re compile(' ([\s\S]*?) ') title og pattern = re compile(' ') if len(re findall(title pattern, page html)) != or len(re findall(title og pattern, page html)) != : if len(re findall(title pattern, page html)) != : page title = re findall(title pattern, page html) else: page title = re findall(title og pattern, page html) page title = page title[ ] # modificare cuvinte titlu title words = page title split(' ') new title words = list() for w in title words: if w isupper(): new title words append(w lower() capitalize()) else: new title words append(w) page title = " " join(new title words) # description description pattern = re compile(' ') description og pattern = re compile(' ') text description = 'MANCARE' if len(re findall(description pattern, page html)) == and len(re findall(description og pattern, page html)) == : print("AM STERS") os remove(filepath) counter sterse += continue elif len(re findall(description pattern, page html)) != and len(re findall(description og pattern, page html)) == : print("CAZ ") text description = re findall(description pattern, page html) text description = text description[ ] description model = ' ' format(text description) og description model = ' ' format(text description) page html = re sub(r'( )', description model + '\n' + og description model, page html) elif len(re findall(description pattern, page html)) == and len(re findall(description og pattern, page html)) != : print("CAZ II") text description = re findall(description og pattern, page html) text description = text description[ ] description model = ' ' format(text description) og description model = ' ' format(text description) page html = re sub(r'( )', description model + '\n' + og description model, page html) else: print("CAZ III") text description = re findall(description pattern, page html) text description = text description[ ] print("DESCCC: ", text description) print(page title) print(f) # canonical canonical tag pattern = re compile(' ') canonical og tag pattern = re compile(' ') if len(re findall(canonical tag pattern, page html)) != or len(re findall(canonical og tag pattern, page html)) != : if len(re findall(canonical tag pattern, page html)) != : canonical tag = re findall(canonical tag pattern, page html) else: canonical tag = re findall(canonical og tag pattern, page html) canonical tag = canonical tag[ ] # print("canonical: ", canonical tag) # adaugare nota finala link pattern = ' {} ' format(canonical tag, canonical tag) new page text = new page text + '\n' + ' * Surs&# ;: {} ' format(link pattern) + '\n' # adaugare informatie informatie = (page title, text description, canonical tag, new page text) INFORMATII PAGINI append(informatie) else: print("Pagina structura gresita - canonical: ", f) continue else: print("Pagina structura gresita - title: ", f) continue else: print("Pagina structura gresita - text: ", f) continue # page are structura: (page title, page description, canonical tag, new page text) def copiaza continut txt html(page, cale fisier html): # astea sunt argumentele functiei, adica cand apelez functia text html = read text from file(cale fisier html) # aici e pattern-ul pentru expresia regex; ( *?) inseamna ca preia tot ce este intre tag-uri # modifici expresia regulata in functie de ce tag dai ca argument pentru functie articol pattern = re compile(' ([\s\S]*?) [\s\S]*?') text articol = re findall(articol pattern, text html) if len(text articol) != : text articol = text articol[ ] text html = text html replace(text articol, page[ ]) # pe indexul sta new page text else: print("Fisier html fara ARTICOL START/FINAL ") title pattern = re compile(' ([\s\S]*?) ') text title = re findall(title pattern, text html) # : inlocuire h cu text titlu ( ) - Aici SCHIMBI TAGUL LA TITLUL ARTICOLULUI DIN PAGINA h pattern = re compile(' ( *?) ') text h = re findall(h pattern, text html) if len(text title) != : text title = text title[ ] canonical words = '' if page[ ] endswith('/'): canonical words = page[ ] split('/')[- ] else: print("PAGE: ", page[ ]) canonical words = page[ ] split('/')[- ] # daca nu se termina cu / sau html, atunci sa imi ia ultima bucata dupa / if ' ' in canonical words: # in cazul in care se termina cu html, htm canonical words = canonical words split(' ')[ ] # creare nume nou link new file name fara spatiu = canonical words + ' html' # inlocuire text titlu cu primele cuvinte text html = text html replace(text title, page[ ]) # page[ ] titlul # : inlocuire h cu text titlu ( ) if len(text h ) != : text h = text h [ ] text html = text html replace(text h , page[ ]) else: print("Fisierul nu are tag-ul h ") # : inlocuire text canonical tag canonical tag pattern = re compile(' ') canonical tag = re findall(canonical tag pattern, text html) if len(canonical tag) != : canonical tag = canonical tag[ ] #text html = text html replace(canonical tag, new file name fara spatiu) # daca trebuie sa pui si "https://neculaifantanaru com/" in fata comentezi linia de mai sus si o decomentezi pe cea de jos text html = text html replace(canonical tag, "https://trinketbox ro/" + new file name fara spatiu) else: print("Fisier fara tag canonical") else: print("Fisier html fara titlu ") description pattern = re compile(' ') text description = re findall(description pattern, text html) if len(text description) != : text description = text description[ ] # print("text description: ", text description) text html = text html replace(text description, page[ ]) # description pe pozitia else: print("Fisier html fara description ") file path = os path dirname(cale fisier html) + "\\" + "fisiere html modificate" + "\\" + new file name fara spatiu # in acest folder se duc fisierele FACUTE write to file(text html, file path) # print("Fisier: ", new file name fara spatiu) print("Scriere efectuata cu succes ") def creare fisiere html(cale fisier html): """ Functia itereaza printr-un folder care contine fisiere txt si creeaza fisiere html corespunzatoare """ count = for page in INFORMATII PAGINI: copiaza continut txt html(page, cale fisier html) count += print("Numarul de fisiere modificate: ", count) def main(): creare fisiere html("d:\\Folder \\index trinketbox html") # aici este indexul model de la trinketbox ro # DUPA RULAREA CODULUI, SALVAREA NOILOR FISIERE VA AVEA LOC IN d:\Folder \fisiere html modificate\ # dictionar cuvinte = preia cuvinte lista linkuri(CALE FISIER LINKURI) # print(dictionar cuvinte) if name == ' main ': main() https://neculaifantanaru com https://neculaifantanaru com/en/